In [1]:
import pandas as pd;
import numpy as np;
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
# Load the artist table that every cell below works on.
artists = pd.read_csv("data/artists.csv")

FINILETTI SIMONE, CADONI MATTEO, gruppo Q, Spotify Dataset¶

Artist Dataset¶

In [2]:
# Peek at five randomly chosen rows to get a feel for the columns.
artists.sample(n=5)
Out[2]:
id followers genres name popularity TmpGenres macro genre
359002 3wEWsZuvUW4C3Vuw9tniiZ 285.0 ['aikatsu'] せな・るか from AIKATSU☆STARS! 9 ["'aikatsu'"] alternative
940563 7eBWOD2EmhG5KBFAB5xiSb 85.0 [] Bryon Tosoff 7 [''] alternative
413354 17dzS68S7iPOOgy9NFtPC7 1.0 [] Lorraine Palmer O'Reilly 0 [''] alternative
112405 02VrJiGGcC0lZRC7CgJzpz 0.0 [] Micah Byrns 0 [''] alternative
336950 3PJxzQhkwTuxueC4LC6PLg 6.0 ['classic iskelma'] Aira-Anneli 0 ["'classic iskelma'"] alternative

The most popular genre¶

In [3]:
def categorize(path="data/artists.csv"):
    """Assign every artist a single "macro genre" and save the result to the CSV.

    The CSV at ``path`` is read, two columns are added (or overwritten if they
    already exist), and the file is rewritten in place without the index:

    * ``TmpGenres``   - the raw ``genres`` string with its surrounding brackets
      stripped and split on commas into a list (quotes and spaces are kept).
    * ``macro genre`` - the macro-genre keyword contained in the largest number
      of the artist's genre strings. Ties go to the alphabetically first
      keyword. Artists whose genres contain no keyword at all get ``"other"``.

    Parameters
    ----------
    path : str, optional
        CSV file to read and overwrite. It must have a string column named
        ``genres``. Defaults to the notebook's ``data/artists.csv``.

    Returns
    -------
    None
        The result is only written to ``path``.
    """
    # Kept sorted so that ties between equally frequent keywords resolve
    # alphabetically; the set literal also drops the duplicated "electronic".
    macroGenres = sorted({
        "alternative", "rock", "metal", "pop", "rap", "punk", "jazz", "reggae",
        "soul", "polka", "country", "electronic", "funk", "hip hop", "r&b",
        "folk", "house", "techno", "trance", "indie", "blues", "instrument",
        "disco", "deep", "hardcore", "wave", "trap", "other",
    })

    def getMacroCategory(genreNames):
        """Return the best-matching macro genre for one artist's genre strings."""
        # Plain substring matching: "k-pop" counts for "pop", and "trap" counts
        # for both "trap" and "rap".
        counts = {
            macro: sum(macro in genreName for genreName in genreNames)
            for macro in macroGenres
        }
        best = min(macroGenres, key=lambda macro: (-counts[macro], macro))
        # Without this check an artist with no match at all would silently get
        # the alphabetically first keyword ("alternative").
        return best if counts[best] > 0 else "other"

    artists = pd.read_csv(path)
    artists["TmpGenres"] = artists["genres"].apply(lambda x: x.strip('][').split(','))  # "[a, b]" -> ["a", " b"]
    artists["macro genre"] = artists["TmpGenres"].apply(getMacroCategory)
    artists.to_csv(path, index=False)
In [4]:
# categorize()  # rewrites data/artists.csv with the "TmpGenres" and "macro genre" columns;
#               # left disabled here, presumably because the file on disk already has them

# Read the artist table (including the "macro genre" column) back from disk.
artists = pd.read_csv("data/artists.csv")
In [5]:
# Number of artists in each macro genre: one row per genre, count in "size".
genereDF = artists.groupby(["macro genre"], as_index=False).size()

# Horizontal bars: genre names on y, artist counts on a log-scaled x axis.
fig = px.histogram(genereDF, y="macro genre", x="size", log_x=True)
fig.update_yaxes(categoryorder="total ascending")  # largest genre ends up at the top
# Counts are on x and genres on y, so the titles are assigned accordingly.
fig.update_layout(xaxis_title="Number of Artists", yaxis_title="Genre")
fig.layout.yaxis.dtick = 0.5  # presumably forces a label for every genre on the category axis
fig.show()

Relationship between followers and popularity¶

In [6]:
# Plot a random subset of at most 10000 artists instead of the full table.
# The fixed seed makes the figure reproducible across runs, and the min()
# keeps sample() from raising when the table has fewer rows than requested.
tmpData = artists.sample(n=min(10000, len(artists)), random_state=42)

# NOTE: the x axis is logarithmic, so artists with 0 followers cannot be drawn.
fig = px.scatter(tmpData, x="followers", y="popularity", hover_name="name", log_x=True)
fig.show()

Top 20 most popular metal artists¶

In [7]:
# Artists whose macro genre is "metal", reduced to the 20 most popular.
metalArtists = artists.loc[artists["macro genre"] == "metal"]
top20Metal = metalArtists.sort_values(by="popularity", ascending=False).head(20)

# Horizontal bars: one per artist name, bar length is the popularity score.
fig = px.histogram(top20Metal, x="popularity", y="name")
fig.update_yaxes(categoryorder="total ascending")  # most popular artist at the top
fig.update_layout(xaxis_title="Popularity", yaxis_title="Artist")
fig.layout.yaxis.dtick = 0.5  # presumably forces a label for every artist on the category axis
fig.show()
In [8]:
# Artists whose macro genre is "rap", reduced to the 20 most popular.
rapArtists = artists.loc[artists["macro genre"] == "rap"]
top20Rap = rapArtists.sort_values(by="popularity", ascending=False).head(20)

# Horizontal bars: one per artist name, bar length is the popularity score.
fig = px.histogram(top20Rap, x="popularity", y="name", title="Top 20 most popular rap artists")
fig.update_yaxes(categoryorder="total ascending")  # most popular artist at the top
fig.update_layout(xaxis_title="Popularity", yaxis_title="Artist")
fig.layout.yaxis.dtick = 0.5  # presumably forces a label for every artist on the category axis
fig.show()

Maximum Popularity per macro-genre¶

In [9]:
# Highest popularity score reached within each macro genre, as {genre: score}.
# sort=False keeps the genres in order of first appearance in the table, and
# groupby leaves out rows whose macro genre is missing.
maxPerGenere = (
    artists.groupby("macro genre", sort=False)["popularity"]
    .max()
    .to_dict()
)
In [10]:
# One row per macro genre with its highest popularity score, best genre first.
maxPopularityDF = pd.DataFrame(
    {"Genre": list(maxPerGenere.keys()), "Popularity": list(maxPerGenere.values())}
).sort_values(by="Popularity", ascending=False)

# Vertical bars: genres on the categorical x axis, popularity on the numeric y axis.
fig = px.histogram(maxPopularityDF, x="Genre", y="Popularity")
# Category ordering only has an effect on the categorical axis, which is x here.
fig.update_xaxes(categoryorder="total descending")
fig.update_layout(xaxis_title="Genre", yaxis_title="Popularity")
fig.show()
In [ ]: